## data analysis '53 paper 
## train distances results

rm(list = ls())

library(readxl)
library(multiwayvcov)
library(lmtest)
library(Hmisc)

setwd("~/Dropbox/Current projects/1953/Replication archive")


## choose signal strength measure to employ
signal <- "lrm_4"

## should municipalities be dropped/recoded to account for possible jamming?
## jamming = km10, km12, km15; adjust = "drop", "min"
jamming <- ""
adjust <- ""

load(paste("1953data", signal, adjust, jamming, ".RData", sep = ""))



## rescale train distance to East Berlin so that coefficients are not too small
dat$traindist1 <- dat$traindist1/100


table1 <- matrix(NA,45,4)
rownames(table1) <- c(	"intercept",
						"county capital",
						"KVP base",
						"$%$ agriculture and forestry",
						"$%$ mining",
						"$%$ metalworking industry",
						"$%$ chemical industry",
						"$%$ woods and plastics",
						"$%$ consumer goods",
						"$%$ construction",
						"$%$ transportation",
						"$%$ commerce",
						"$%$ services",
						"log(population size)",
						"$%$ population male",
						"population density",
						"$%$ population change 1939--50",
						"$%$ population in same {\\it Land}",
						"$%$ 8 years of education",
						"$%$ 10 years of education",
						"$%$ 12 years of education",
						"$%$ more than 12 years of education",
						"$%$ trade school degree",
						"$%$ university degree",
						"$%$ protestant",
						"$%$ catholic",
						"$%$ agnostic",
						"living space per capita (m$^2$)", 
						"$%$ turnout '32",
						"$%$ invalid votes '32",
						"$%$ NSDAP '32",
						"$%$ SPD '32",
						"$%$ KPD '32",
						"$%$ Zentrum '32",
						"$%$ DNVP '32",
						"$%$ DVP '32",
						"RIAS signal strength",
						"RIAS signal strength$^2$",
						"RIAS signal strength$^3$",
						"train distance to East Berlin (100 km)",
						"train distance to East Berlin$^2$ (100 km)",
						"train distance to East Berlin$^3$ (100 km)",
						"Wald test RIAS $p$-value",
						"Wald test train distance $p$-value",
						"district fixed effects")

table1[45,1] <- "No"
table1[45,3] <- "Yes"



######################################################
## probit model 1 --- train distance; no fixed effects
######################################################
source("hl.R")
probit.out <- glm(protest ~

							county_capital +
							KVP +
							LandForstwirtschaft +
							Bergbau +
							Metallverarbeitung +
							Chemie +
							HolzKunstmassen +
							Verbrauchsgueter +
							Bauwirtschaft +
							Verkehrswesen +
							Handel +
							Dienstleistung +
							log(population) +
							prop_male +
							density +
							pop_change +
							prop_Land_same39 +
							prop_educ_8 +
							prop_educ_10 +
							prop_educ_12 +
							prop_educ_12plus +
							prop_educ_Fachschule +
							prop_educ_Hochschule +
							prop_evangelisch +
							prop_katholisch +
							prop_glaubenslos +
							living_space +
							turnout32 +
							invalid32 +
							NSDAP32 +
							SPD32 +
							KPD32 +
							Z32 +
							DNVP32 +
							DVP32 +

							signal + I(signal^2) + I(signal^3) +
							traindist1 +
							I(traindist1^2) +
							I(traindist1^3) +
							as.factor(size),
							x = TRUE, y = TRUE, model = TRUE,
							data = dat, family = binomial(link = "probit"))	

summary(probit.out)

## clustered standard errors
vcov.cl <- cluster.vcov(probit.out, dat$county)
temp <- coeftest(probit.out, vcov.cl)

table1[1:42,1:2] <- hl(temp[1:42,1:2])


## Wald test RIAS
V <- vcov.cl
Q <- matrix(0,3,nrow(V))
Q[1, which(probit.out$coef == probit.out$coef["signal"])] <- 1
Q[2, which(probit.out$coef == probit.out$coef["I(signal^2)"])] <- 1
Q[3, which(probit.out$coef == probit.out$coef["I(signal^3)"])] <- 1

r <- rep(0,3)

theta <- probit.out$coef

W <- t(Q %*% theta - r) %*% solve(Q %*% V %*% t(Q)) %*% (Q %*% theta - r)

## compare to Chi^2 distribution to get p-value
table1[43,1] <- round(1 - pchisq(W, df = nrow(Q)),3)


## Wald test train distance
V <- vcov.cl
Q <- matrix(0,3,nrow(V))
Q[1, which(probit.out$coef == probit.out$coef["traindist1"])] <- 1
Q[2, which(probit.out$coef == probit.out$coef["I(traindist1^2)"])] <- 1
Q[3, which(probit.out$coef == probit.out$coef["I(traindist1^3)"])] <- 1

r <- rep(0,3)

theta <- probit.out$coef

W <- t(Q %*% theta - r) %*% solve(Q %*% V %*% t(Q)) %*% (Q %*% theta - r)

## compare to Chi^2 distribution to get p-value
table1[44,1] <- round(1 - pchisq(W, df = nrow(Q)),3)



###################################################
## simulate effect of train distance to East Berlin
###################################################
library(MASS)
X.pred <- model.matrix(probit.out)
d <- seq(from = min(dat$traindist1), to = max(dat$traindist1), length.out = 20)
sims <- 10000
out <- matrix(NA,length(d),sims)
rownames(out) <- d


set.seed(123)
m <- 1; n <- 1
for(m in 1:length(d))	{

	X.pred[,"traindist1"] <- d[m]
	X.pred[,"I(traindist1^2)"] <- d[m]^2
	X.pred[,"I(traindist1^3)"] <- d[m]^3

		for(n in 1:sims)	{

			gamma.tilde <- mvrnorm(n = 1, mu = probit.out$coef, Sigma = vcov.cl)
			out[m,n] <- mean(pnorm(X.pred %*% gamma.tilde))


							}

	cat(m,"\n")

						}

out2 <- matrix(NA,3,length(d))
out2[1,] <- apply(out,1,mean)
out2[2,] <- apply(out,1,sort)[(sims*.025)+1,]
out2[3,] <- apply(out,1,sort)[(sims*.975),]


pdf("plot1train.pdf")
plot(d*100, out2[1,], ylim = c(0,.7), type = "l",
	xlab = "Train distance to East Berlin (km)",
	ylab = "Estimated probability of protest event", cex.lab = 1.2, cex.axis = 1.2,
	lwd = 2)

polygon(x = c(d*100, rev(d*100)), y = c(out2[2,], rev(out2[3,])), col = "#66666666", border = NA)
lines(d*100, out2[1,], lwd = 2)
dev.off()



############################################
## model 2 --- train distance; fixed effects
############################################						
probit.out <- glm(protest ~

							county_capital +
							KVP +
							LandForstwirtschaft +
							Bergbau +
							Metallverarbeitung +
							Chemie +
							HolzKunstmassen +
							Verbrauchsgueter +
							Bauwirtschaft +
							Verkehrswesen +
							Handel +
							Dienstleistung +
							log(population) +
							prop_male +
							density +
							pop_change +
							prop_Land_same39 +
							prop_educ_8 +
							prop_educ_10 +
							prop_educ_12 +
							prop_educ_12plus +
							prop_educ_Fachschule +
							prop_educ_Hochschule +
							prop_evangelisch +
							prop_katholisch +
							prop_glaubenslos +
							living_space +
							turnout32 +
							invalid32 +
							NSDAP32 +
							SPD32 +
							KPD32 +
							Z32 +
							DNVP32 +
							DVP32 +

							signal + I(signal^2) + I(signal^3) +
							traindist1 +
							I(traindist1^2) +
							I(traindist1^3) +
							as.factor(size) +
							as.factor(district),
							x = TRUE, y = TRUE, model = TRUE,
							data = dat, family = binomial(link = "probit"))	

summary(probit.out)

## clustered standard errors
vcov.cl <- cluster.vcov(probit.out, dat$county)
temp <- coeftest(probit.out, vcov.cl)

table1[1:42,3:4] <- hl(temp[1:42,1:2])



## Wald test RIAS
V <- vcov.cl
Q <- matrix(0,3,nrow(V))
Q[1, which(probit.out$coef == probit.out$coef["signal"])] <- 1
Q[2, which(probit.out$coef == probit.out$coef["I(signal^2)"])] <- 1
Q[3, which(probit.out$coef == probit.out$coef["I(signal^3)"])] <- 1

r <- rep(0,3)

theta <- probit.out$coef

W <- t(Q %*% theta - r) %*% solve(Q %*% V %*% t(Q)) %*% (Q %*% theta - r)

## compare to Chi^2 distribution to get p-value
table1[43,3] <- round(1 - pchisq(W, df = nrow(Q)),3)


## Wald test distance
V <- vcov.cl
Q <- matrix(0,3,nrow(V))
Q[1, which(probit.out$coef == probit.out$coef["traindist1"])] <- 1
Q[2, which(probit.out$coef == probit.out$coef["I(traindist1^2)"])] <- 1
Q[3, which(probit.out$coef == probit.out$coef["I(traindist1^3)"])] <- 1

r <- rep(0,3)

theta <- probit.out$coef

W <- t(Q %*% theta - r) %*% solve(Q %*% V %*% t(Q)) %*% (Q %*% theta - r)

## compare to Chi^2 distribution to get p-value
table1[44,3] <- round(1 - pchisq(W, df = nrow(Q)),3)



####################################
## simulate effect of train distance
####################################
library(MASS)
X.pred <- model.matrix(probit.out)
d <- seq(from = min(dat$traindist1), to = max(dat$traindist1), length.out = 20)
sims <- 10000
out <- matrix(NA,length(d),sims)
rownames(out) <- d


set.seed(123)
m <- 1; n <- 1
for(m in 1:length(d))	{

	X.pred[,"traindist1"] <- d[m]
	X.pred[,"I(traindist1^2)"] <- d[m]^2
	X.pred[,"I(traindist1^3)"] <- d[m]^3

		for(n in 1:sims)	{

			gamma.tilde <- mvrnorm(n = 1, mu = probit.out$coef, Sigma = vcov.cl)
			out[m,n] <- mean(pnorm(X.pred %*% gamma.tilde))


							}

	cat(m,"\n")

						}

out2 <- matrix(NA,3,length(d))
out2[1,] <- apply(out,1,mean)
out2[2,] <- apply(out,1,sort)[(sims*.025)+1,]
out2[3,] <- apply(out,1,sort)[(sims*.975),]


pdf("plot2train.pdf")
plot(d*100, out2[1,], ylim = c(0,.7), type = "l",
	xlab = "Train distance to East Berlin (km)",
	ylab = "Estimated probability of protest event", cex.lab = 1.2, cex.axis = 1.2,
	lwd = 2)

polygon(x = c(d*100, rev(d*100)), y = c(out2[2,], rev(out2[3,])),
	col = "#66666666", border = NA)
lines(d*100, out2[1,], lwd = 2)
dev.off()


## output table1
latex(table1, file = "")







